We want to look at real datasets to simulate realistic datasets.
data("allen")
prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filterGenes <- apply(assay(prefilter) > 10, 1, sum) >= 10
sampleCells <- sample(ncol(prefilter), 100)
postfilter <- prefilter[filterGenes, sampleCells]
bio <- as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
postfilter <- assay(postfilter)
vars <- rowVars(log1p(postfilter))
names(vars) <- rownames(postfilter)
vars <- sort(vars, decreasing = TRUE)
core <- postfilter[names(vars)[1:1000],]
detection_rate <- colSums(core>0)
coverage <- colSums(core)
We will color-code the cells by either known cell type or by inferred cluster (inferred in the original study). Remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells, sample at random 100 cells, select 1,000 most variable genes. Then, dimensions are
dim(core)
## [1] 1000 100
par(mfrow = c(1,2))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, rowMeans(core == 0), xlim = c(3,8), ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0), xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = '1,000 most variable genes')
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts")
Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 268.035 143.760 162.351
Our true W is
par(mfrow=c(1, 2))
plot(zinb@W, col=cols[bio], pch=19, xlab="W1", ylab="W2",
main="color = known cell-type")
#legend("bottomright", levels(bio), fill=cols)
plot(zinb@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2",
main="color = inferred clusters")
#legend("topright", levels(cl), fill=cols2)
true_W <- zinb@W
Gamma_mu and gamma_pi are not correlated in this dataset. Let’s simulate \[\gamma_{\mu} \sim \mathcal{N}(5, .5)\] \[\gamma_{\pi} \sim \mathcal{N}(-1, 2.5)\]
gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]
c(mean = mean(gamma_mu), sd = sd(gamma_mu))
## mean sd
## 4.9465718 0.3887733
c(mean = mean(gamma_pi), sd = sd(gamma_pi))
## mean sd
## -0.8150281 0.8255717
true_gamma_mu = rnorm(100, 5, 0.4)
true_gamma_pi = rnorm(100, -.8, .8)
true_gamma = data.frame(true_gamma_mu = true_gamma_mu,
true_gamma_pi = true_gamma_pi,
celltype = bio)
true_gamma = melt(true_gamma)
df <- data.frame(gamma_mu = gamma_mu,
gamma_pi = gamma_pi,
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.1824662 0.2365889 0.8958176
## gamma_pi -0.1824662 1.0000000 -0.9914158 -0.4572457
## detection_rate 0.2365889 -0.9914158 1.0000000 0.5103829
## coverage 0.8958176 -0.4572457 0.5103829 1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi,
true_gamma_mu = true_gamma_mu, true_gamma_pi = true_gamma_pi,
celltype = bio)
gamma = melt(gamma)
gamma$status = sapply(gamma$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
gamma$variable = gsub('true_', '', gamma$variable)
ggplot(gamma, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(status ~ variable, scale = 'free_x') + xlab('gamma')
par(mfrow=c(1,2))
qqplot(gamma_mu, true_gamma_mu, main = 'QQplot - gamma_mu')
abline(a=0,b=1)
qqplot(gamma_pi, true_gamma_pi, main = 'QQplot - gamma_pi')
abline(a=0,b=1)
ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
geom_boxplot() + coord_flip() +
facet_grid(status~ variable, scales = 'free_x') +
theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
facet_grid(status ~ variable) + xlab('gamma') + theme_bw()
rmixtNorm = function(n, m1, sd1, m2, sd2, p = .95, seed = NULL){
if (!is.null(seed)) set.seed(seed)
temp = cbind(rnorm(n, m1, sd1), rnorm(n, m2, sd2))
id = sample(1:2, n, rep = T, prob = c(p,1-p))
id = cbind(1:n,id)
temp[id]
}
beta_mu and beta_pi are not correlated in the dataset. To simulate the long tail of the distributions of beta, let’s simulate beta_mu and beta_pi as a mixture of gaussians \[\beta_{\mu} \sim .9 \mathcal{N}(.8, .2) + .1 \mathcal{N}(0, 1)\] \[\beta_{\pi} \sim .9 \mathcal{N}(-1, 5) + .1 \mathcal{N}(-25, 10)\]
beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]
c(mean = mean(beta_mu), sd = sd(beta_mu))
## mean sd
## 0.9923809 0.4641701
c(mean = mean(beta_pi), sd = sd(beta_pi))
## mean sd
## -1.700859 6.031535
true_beta_mu = rmixtNorm(1000, .8, .4, 0, 1, seed = 123)
true_beta_pi = rmixtNorm(1000, -.4, .8, -25, 20, seed = 456)
df <- data.frame(beta_mu = beta_mu,
beta_pi = beta_pi)
pairs(df, pch=19)
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.2587559
## beta_pi -0.2587559 1.0000000
true_beta = data.frame(true_beta_mu = true_beta_mu, true_beta_pi = true_beta_pi)
pairs(true_beta, pch=19)
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.2587559
## beta_pi -0.2587559 1.0000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi,
true_beta_mu = true_beta_mu,
true_beta_pi = true_beta_pi)
beta = melt(beta)
beta$status = sapply(beta$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
beta$variable = gsub('true_', '', beta$variable)
ggplot(beta, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(status ~ variable, scales = 'free') + xlab('beta')
par(mfrow=c(1,2))
qqplot(beta_mu, true_beta_mu, main = 'QQplot - beta_mu')
abline(a=0,b=1)
qqplot(beta_pi, true_beta_pi, main = 'QQplot - beta_pi')
abline(a=0,b=1)
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot() + facet_grid(status ~ variable) +
coord_flip() +
theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)
# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
geom_boxplot(outlier.shape = NA) +
facet_grid(status ~ variable, scales = 'free') + coord_flip() +
theme_bw() + ylab('beta removing outliers') +
scale_x_discrete(breaks = c('', ''), drop=FALSE) +
scale_y_continuous(limits = c(min, max))
## Warning: Removed 1919 rows containing non-finite values (stat_boxplot).
alpha_mu and alpha_pi are not correlated.
par(mfrow=c(1,2))
plot(t(zinb@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
alpha_mu_1 = zinb@alpha_mu[1, ]
alpha_mu_2 = zinb@alpha_mu[2, ]
alpha_pi_1 = zinb@alpha_pi[1, ]
alpha_pi_2 = zinb@alpha_pi[2, ]
df <- data.frame(alpha_mu_1 = alpha_mu_1,
alpha_mu_2 = alpha_mu_2,
alpha_pi_1 = alpha_pi_1,
alpha_pi_2 = alpha_pi_2)
pairs(df, pch=19, main = 'fitted alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 0.10127903 -0.39681655 -0.08947395
## alpha_mu_2 0.10127903 1.00000000 -0.06465126 -0.26197093
## alpha_pi_1 -0.39681655 -0.06465126 1.00000000 -0.02167753
## alpha_pi_2 -0.08947395 -0.26197093 -0.02167753 1.00000000
alpha_mu and alpha_pi are not correlated in this dataset. Let’s simulate \[\alpha_{\mu1} \sim .97 \mathcal{N}(0, 0.05) + .03 \mathcal{N}(0, 1)\] \[\alpha_{\mu2} \sim .97 \mathcal{N}(0, 0.1) + .03 \mathcal{N}(0, 1)\] \[\alpha_{\pi1} \sim .97 \mathcal{N}(0, 0.5) + .03 \mathcal{N}(0, 10)\] \[\alpha_{\pi2} \sim .97 \mathcal{N}(0, 0.5) + .03 \mathcal{N}(0, 10)\]
c(mean = mean(alpha_mu_1), sd = sd(alpha_mu_1))
## mean sd
## -0.07283686 0.18511472
c(mean = mean(alpha_mu_2), sd = sd(alpha_mu_2))
## mean sd
## 0.02986855 0.21500970
c(mean = mean(alpha_pi_1), sd = sd(alpha_pi_1))
## mean sd
## -0.3420094 2.3231649
c(mean = mean(alpha_pi_2), sd = sd(alpha_pi_2))
## mean sd
## 0.04426955 1.57533333
true_alpha_mu_1 = rmixtNorm(1000, 0, 0.05, 0, 1, p =.97)
true_alpha_mu_2 = rmixtNorm(1000, 0, 0.1, 0, 1, p =.97)
true_alpha_pi_1 = rmixtNorm(1000, 0, .5, 0, 10, p =.97)
true_alpha_pi_2 = rmixtNorm(1000, 0, .5, 0, 10, p =.97)
df <- data.frame(true_alpha_mu_1 = true_alpha_mu_1,
true_alpha_mu_2 = true_alpha_mu_2,
true_alpha_pi_1 = true_alpha_pi_1,
true_alpha_pi_2 = true_alpha_pi_2)
pairs(df, pch=19, main = 'simulated alpha')
print(cor(df, method="spearman"))
## true_alpha_mu_1 true_alpha_mu_2 true_alpha_pi_1
## true_alpha_mu_1 1.00000000 -0.05292706 0.032025440
## true_alpha_mu_2 -0.05292706 1.00000000 -0.068814057
## true_alpha_pi_1 0.03202544 -0.06881406 1.000000000
## true_alpha_pi_2 0.05185383 -0.03684648 0.005222261
## true_alpha_pi_2
## true_alpha_mu_1 0.051853828
## true_alpha_mu_2 -0.036846481
## true_alpha_pi_1 0.005222261
## true_alpha_pi_2 1.000000000
alpha = data.frame(true_alpha_mu_1 = true_alpha_mu_1,
true_alpha_mu_2 = true_alpha_mu_2,
true_alpha_pi_1 = true_alpha_pi_1,
true_alpha_pi_2 = true_alpha_pi_2,
alpha_mu_1 = alpha_mu_1,
alpha_mu_2 = alpha_mu_2,
alpha_pi_1 = alpha_pi_1,
alpha_pi_2 = alpha_pi_2)
alpha = melt(alpha)
alpha$status = sapply(alpha$variable, function(x) ifelse(grepl('true', x), 'simulated', 'fitted'))
alpha$variable = gsub('true_', '', alpha$variable)
ggplot(alpha, aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 50, col = 'gray') +
geom_density(col= 'blue', size = .5) +
facet_grid(status ~ variable, scales = 'free_x') + xlab('true_beta')
par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
xlab = 'edgeR tagwise dispersion', main = 'Dispersion')
print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.3745392
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')
par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
par(mfrow = c(1, 2))
hist(zinb@zeta, xlab = 'fitted zeta', main = '')
c(mean = mean(zinb@zeta), sd = sd(zinb@zeta))
## mean sd
## -0.7551300 0.4551841
true_zeta = rnorm(1000, -.75, .45)
hist(true_zeta, breaks = 15, xlab = 'simulated zeta', main = '')
true_alpha_mu <- matrix(c(true_alpha_mu_1, true_alpha_mu_2),
nrow = 2, byrow = T)
true_alpha_pi <- matrix(c(true_alpha_pi_1, true_alpha_pi_2),
nrow = 2, byrow = T)
true_beta_mu <- matrix(true_beta_mu, nrow = 1)
true_beta_pi <- matrix(true_beta_pi, nrow = 1)
true_gamma_mu <- matrix(true_gamma_mu, nrow = 1)
true_gamma_pi <- matrix(true_gamma_mu, nrow = 1)
sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
beta_mu=true_beta_mu, beta_pi=true_beta_pi,
zeta = true_zeta, gamma_mu=true_gamma_mu,
gamma_pi=true_gamma_pi)
sim_data <- zinbSim(sim_obj, seed=3984)
sim_data$zeroFraction
## [1] 0.92123
sum(core==0)/(nrow(core) * ncol(core))
## [1] 0.34272
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
true_detection_rate <- rowMeans(sim_data$counts>0)
true_coverage <- rowSums(sim_data$counts)
par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), detection_rate,
xlab="Average fitted Pi", ylab="Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlim = c(5, 7),
xlab="Average fitted log Mu", ylab="log Coverage", pch=19,
col=cols[bio])
plot(rowMeans(getPi(sim_obj)), true_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(sim_obj))), log1p(true_coverage), xlim = c(5, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=cols[bio])
par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Fitted',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
smoothScatter(colMeans(log1p(getMu(sim_obj))), colMeans(getPi(sim_obj)),
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8), nbin = 256,
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
df <- data.frame(alpha_pi_1 = alpha_pi_1,
alpha_pi_2 = alpha_pi_2,
beta_pi = beta_pi)
pairs(df, pch=19, main = 'fitted')
print(cor(df, method="spearman"))
## alpha_pi_1 alpha_pi_2 beta_pi
## alpha_pi_1 1.00000000 -0.02167753 0.28375927
## alpha_pi_2 -0.02167753 1.00000000 -0.08034145
## beta_pi 0.28375927 -0.08034145 1.00000000
mod <- model.matrix(~ true_W)
zinb_sim <- zinbFit(core, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3, commondispersion=FALSE)
true_alpha_mu <- zinb_sim@beta_mu[-1,]
true_alpha_pi <- zinb_sim@beta_pi[-1,]
true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
true_gamma_mu <- zinb_sim@gamma_mu
true_gamma_pi <- zinb_sim@gamma_pi
sim_obj <- zinbModel(W=true_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
beta_mu=true_beta_mu, beta_pi=true_beta_pi,
zeta = true_zeta, gamma_mu=true_gamma_mu,
gamma_pi=true_gamma_pi)
sim_data <- zinbSim(sim_obj, seed=3984)
sim_data$zeroFraction
## [1] 0.3312
sum(core==0)/(nrow(core) * ncol(core))
## [1] 0.34272
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
true_detection_rate <- rowMeans(sim_data$counts>0)
true_coverage <- rowSums(sim_data$counts)
par(mfrow=c(2, 2))
plot(rowMeans(getPi(zinb)), detection_rate,
xlab="Average fitted Pi", ylab="Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(zinb))), log1p(coverage), xlim = c(5, 7),
xlab="Average fitted log Mu", ylab="log Coverage", pch=19,
col=cols[bio])
plot(rowMeans(getPi(sim_obj)), true_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=cols[bio], ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(sim_obj))), log1p(true_coverage), xlim = c(5, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=cols[bio])
par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Fitted',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
smoothScatter(colMeans(log1p(getMu(sim_obj))), colMeans(getPi(sim_obj)),
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8), nbin = 256,
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
B <- 50
sim_data <- lapply(seq_len(B), function(i) zinbSim(sim_obj, seed=i))
save(sim_obj, sim_data, file="sim_allen_k2_noCorr.rda")